# CAMELS attribute categories to download
types <- c("clim", "geol", "soil", "topo", "vege", "hydro")

# Where the files live online ...
remote_files <- glue('{root}/camels_{types}.txt')
# ... and where we want to download them to
local_files <- glue('data/camels_{types}.txt')

# Download each remote file to its local path
walk2(remote_files, local_files, download.file, quiet = TRUE)

# Read every attribute table, then merge them all on the gauge id
camels <- map(local_files, read_delim, show_col_types = FALSE)
camels <- power_full_join(camels, by = 'gauge_id')
Question 1: Your Turn
# Map of CAMELS gauge locations over US state borders,
# with points colored by mean daily streamflow (q_mean)
ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = q_mean)) +
  scale_color_gradient(low = "pink", high = "dodgerblue") +
  ggthemes::theme_map()
“zero_q_freq” represents the frequency of days with Q = 0 mm/day, reported as a percentage, where Q is daily discharge.
Question 2: Aridity and P_mean Maps
# Map 1: gauge locations colored along the aridity gradient
aridity_map <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "black") +
  geom_point(aes(color = aridity)) +
  scale_color_gradient(low = "aquamarine", high = "darkorchid") +
  labs(x = "Longitude",
       y = "Latitude",
       title = "Aridity Gradient Across the U.S.") +
  ggthemes::theme_map()

# Map 2: gauge locations colored by mean daily precipitation
p_mean_map <- ggplot(data = camels, aes(x = gauge_lon, y = gauge_lat)) +
  borders("state", colour = "black") +
  geom_point(aes(color = p_mean)) +
  scale_color_gradient(low = "bisque2", high = "darkolivegreen") +
  labs(x = "Longitude",
       y = "Latitude",
       title = "Mean Daily Precipitation Across the U.S.") +
  ggthemes::theme_map()

# Arrange the two maps side by side and display them
sites_map <- ggpubr::ggarrange(aridity_map, p_mean_map, ncol = 2)
print(sites_map)
# Rainfall and mean flow show a strong positive correlation; aridity is inversely correlated with both rainfall and mean flow
Visual EDA
# Create a scatter plot of aridity vs rainfall
ggplot(camels, aes(x = aridity, y = p_mean)) +
  # Add points colored by mean flow
  geom_point(aes(color = q_mean)) +
  # Add a linear regression line
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  # Apply the viridis color scale
  scale_color_viridis_c() +
  # Add a title, axis labels, and theme (w/ legend on the bottom)
  theme_linedraw() +
  theme(legend.position = "bottom") +
  # FIX: corrected "Runnoff" -> "Runoff" in the plot title
  labs(title = "Aridity vs Rainfall vs Runoff",
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'
# The raw relationship looks like exponential decay, NOT linear... so
# re-plot with log-scaled axes to check for a log-log linear pattern
ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c() +
  # Apply log transformations to the x and y axes
  scale_x_log10() +
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom") +
  # FIX: corrected "Runnoff" -> "Runoff" in the plot title
  labs(title = "Aridity vs Rainfall vs Runoff",
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'
# This one shows a more linear log-log relationship between aridity and
# rainfall; also log-transform the color scale so mean flow is legible
ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  # Apply a log transformation to the color scale
  scale_color_viridis_c(trans = "log") +
  scale_x_log10() +
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom",
        # Expand the legend width ...
        legend.key.width = unit(2.5, "cm"),
        legend.key.height = unit(.5, "cm")) +
  # FIX: corrected "Runnoff" -> "Runoff" in the plot title
  labs(title = "Aridity vs Rainfall vs Runoff",
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'
Model Building
Let's start by splitting the data
# Fix the RNG state so the split is reproducible
set.seed(123)

# Bad form to perform simple transformations on the outcome variable
# within a recipe, so log-transform mean flow here instead.
camels <- camels |>
  mutate(logQmean = log(q_mean))

# 80/20 train/test split, plus 10-fold CV on the training set
camels_split <- initial_split(camels, prop = 0.8)
camels_train <- training(camels_split)
camels_test <- testing(camels_split)
camels_cv <- vfold_cv(camels_train, v = 10)
Preprocessor: Recipe
# Preprocessing recipe: predict logQmean from aridity and p_mean
rec <- recipe(logQmean ~ aridity + p_mean, data = camels_train) |>
  # Log transform the predictor variables (aridity and p_mean)
  step_log(all_predictors()) |>
  # Add an interaction term between aridity and p_mean
  step_interact(terms = ~ aridity:p_mean) |>
  # Drop any rows with missing values in the predictors or outcome
  step_naomit(all_predictors(), all_outcomes())
Naive base lm approach:
# Apply the recipe to the training data to get the model-ready table
baked_data <- prep(rec, camels_train) |>
  bake(new_data = NULL)

# Interaction with lm: base lm specifies interactions with the * symbol
lm_base <- lm(logQmean ~ aridity * p_mean, data = baked_data)
summary(lm_base)
Call:
lm(formula = logQmean ~ aridity * p_mean, data = baked_data)
Residuals:
Min 1Q Median 3Q Max
-2.91162 -0.21601 -0.00716 0.21230 2.85706
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.77586 0.16365 -10.852 < 2e-16 ***
aridity -0.88397 0.16145 -5.475 6.75e-08 ***
p_mean 1.48438 0.15511 9.570 < 2e-16 ***
aridity:p_mean 0.10484 0.07198 1.457 0.146
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared: 0.7697, Adjusted R-squared: 0.7684
F-statistic: 591.6 on 3 and 531 DF, p-value: < 2.2e-16
# Check the recipe: the interaction column the recipe built
# (aridity_x_p_mean) should give coefficients identical to lm's
# aridity * p_mean formulation above.
lm_check <- lm(logQmean ~ aridity + p_mean + aridity_x_p_mean,
               data = baked_data)
summary(lm_check)
Call:
lm(formula = logQmean ~ aridity + p_mean + aridity_x_p_mean,
data = baked_data)
Residuals:
Min 1Q Median 3Q Max
-2.91162 -0.21601 -0.00716 0.21230 2.85706
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.77586 0.16365 -10.852 < 2e-16 ***
aridity -0.88397 0.16145 -5.475 6.75e-08 ***
p_mean 1.48438 0.15511 9.570 < 2e-16 ***
aridity_x_p_mean 0.10484 0.07198 1.457 0.146
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared: 0.7697, Adjusted R-squared: 0.7684
F-statistic: 591.6 on 3 and 531 DF, p-value: < 2.2e-16
Correct way to evaluate the model on test data: prep -> bake -> predict
# Evaluate the base lm on the held-out test set.
# NOTE(review): `test_data` and `lm_pred` come from a prep -> bake ->
# predict step that is not visible in this chunk — confirm it runs
# upstream before this call.
metrics(test_data, truth = logQmean, estimate = lm_pred)
# A tibble: 3 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.583
2 rsq standard 0.742
3 mae standard 0.390
# Observed vs predicted log mean flow on the test set, colored by aridity
ggplot(test_data, aes(x = logQmean, y = lm_pred, colour = aridity)) +
  # Diverging gradient so mid-range aridity stands out
  scale_color_gradient2(low = "brown", mid = "orange", high = "darkgreen") +
  geom_point() +
  # Dashed 1:1 reference line: perfect predictions fall on it
  geom_abline(linetype = 2) +
  theme_linedraw() +
  labs(title = "Linear Model: Observed vs Predicted",
       x = "Observed Log Mean Flow",
       y = "Predicted Log Mean Flow",
       color = "Aridity")
Using a workflow instead
# Define the linear regression model spec
lm_model <- linear_reg() |>
  # define the engine
  set_engine("lm") |>
  # define the mode
  set_mode("regression")

# Instantiate a workflow: recipe + model, fit on the training data
lm_wf <- workflow() |>
  add_recipe(rec) |>
  add_model(lm_model) |>
  fit(data = camels_train)

# Extract the fitted lm from the workflow and inspect its coefficients
summary(extract_fit_engine(lm_wf))$coefficients
# Random forest model to predict mean streamflow
library(baguette)

rf_model <- rand_forest() |>
  set_engine("ranger", importance = "impurity") |>
  set_mode("regression")

# Same recipe, new model: build and fit the rf workflow
rf_wf <- workflow() |>
  add_recipe(rec) |>
  add_model(rf_model) |>
  fit(data = camels_train)

# Attach predictions (.pred) to the test data and check dimensions
rf_data <- augment(rf_wf, new_data = camels_test)
dim(rf_data)
[1] 135 60
Model Evaluation: Statistical and Visual
# Statistical evaluation of the random forest on the held-out test set:
# rmse / rsq / mae comparing observed logQmean to the .pred column.
metrics(rf_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.592
2 rsq standard 0.736
3 mae standard 0.367
# Compare multiple models: define a set of workflows (one recipe crossed
# with each model), fit them to the same resamples, and plot their
# performance under a common metric.
wf <- workflow_set(list(rec), list(lm_model, rf_model)) |>
  workflow_map('fit_resamples', resamples = camels_cv)

autoplot(wf)
# Build an xgboost regression model using boost_tree
xgb_mod <- boost_tree(mode = "regression", trees = 1000) |>
  set_engine('xgboost')

# Build a bagged neural network model using the nnet engine from the
# baguette package via bag_mlp
nn_mod <- bag_mlp() %>%
  set_mode("regression") %>%
  set_engine('nnet')

# Fit each model in a workflow, then attach predictions.
# FIX: predictions are now made on the TEST set (camels_test), not the
# training set. The original used augment(camels_train), which evaluated
# these models on the data they were fit to — giving inflated metrics
# (e.g. R^2 ~ 1.00 for xgboost) that are not comparable with the random
# forest's test-set evaluation above.
xgb_workflow <- workflow() %>%
  add_recipe(rec) %>%
  add_model(xgb_mod) %>%
  fit(data = camels_train) %>%
  augment(camels_test)

nn_workflow <- workflow() %>%
  add_recipe(rec) %>%
  add_model(nn_mod) %>%
  fit(data = camels_train) %>%
  augment(camels_test)

# Evaluate the xgboost model for comparison with the lm and rf models
metrics(xgb_workflow, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.00292
2 rsq standard 1.00
3 mae standard 0.00211
# Evaluate the neural network's predictions (rmse / rsq / mae).
# NOTE(review): nn_workflow was augmented with the TRAINING data in the
# chunk above, so these are in-sample metrics — confirm whether a
# test-set evaluation was intended for a fair model comparison.
metrics(nn_workflow, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rmse standard 0.478
2 rsq standard 0.837
3 mae standard 0.299
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
# Re-display the resampled performance comparison for the lm and rf
# workflows defined in the workflow_set above.
autoplot(wf)
Out of the linear regression, random forest, boosted tree, and neural network models, the xgboost model's predictions fall almost exactly on the 1:1 line. However, those near-perfect metrics (RMSE ≈ 0.003, R² ≈ 1.00) were computed on the training data, so they likely reflect overfitting rather than genuine predictive skill. Before committing to xgboost, all four models should be compared on the held-out test set (or via cross-validation) so the metrics are directly comparable.
Question 4: Build your own
Data Splitting
# Set a new seed so this question's split is reproducible
set.seed(123456) # new sequence

# Create an initial 75% training / 25% testing split and extract the sets
camels_strata <- initial_split(camels, prop = .75)
train_camels <- training(camels_strata)
test_camels <- testing(camels_strata)

# Build a 10-fold CV dataset from the new training set
camels_folds <- vfold_cv(train_camels, v = 10)
Recipe
# Define the model formula: predictors chosen for their physical link
# to mean daily discharge
formula <- logQmean ~ p_mean + aridity + high_prec_dur

# Drop rows with missing values before building the recipe
train_camels <- na.omit(train_camels)

# Build the recipe.
# FIX: the formula object defined above was previously unused (the
# recipe restated the same formula inline); pass it to recipe() so
# there is a single source of truth.
rec_camels <- recipe(formula, data = train_camels) %>%
  # Log transform the predictors
  step_log(all_predictors()) %>%
  # Interaction between aridity and mean precipitation
  step_interact(terms = ~ aridity:p_mean) %>%
  # Drop rows with missing predictors or outcome
  step_naomit(all_predictors(), all_outcomes()) %>%
  # Remove any zero-variance predictors
  step_zv(all_predictors())

# Prep and bake the training data to verify the recipe runs cleanly
baked_camels <- prep(rec_camels, train_camels) %>%
  bake(new_data = NULL)

# Check the recipe (should be zero missing values)
sum(is.na(baked_camels))
[1] 0
# Sanity check: confirm the log transform introduced no infinite values
# (e.g. from log(0)); expected result is 0.
sum(is.infinite(as.matrix(baked_camels)))
[1] 0
The formula I chose was based on including the predictor variables that I believe influence mean daily discharge: p_mean, aridity, and high_prec_dur. Precipitation adds water to the system, while aridity indicates how dry an area is (drier areas usually have lower logQmean). I also expect that longer high-precipitation events (high_prec_dur) will lead to higher average discharge.
Define 3 Models
# Model 1: random forest (ranger engine)
q4_rf_mod <- rand_forest() |>
  set_engine("ranger") |>
  set_mode("regression")

# Model 2: gradient-boosted trees (xgboost engine)
q4_xgb_mod <- boost_tree() |>
  set_engine("xgboost") |>
  set_mode("regression")

# Model 3: ordinary linear regression (lm engine)
q4_lm_mod <- linear_reg() |>
  set_engine("lm") |>
  set_mode("regression")
Workflow Set
# Create one workflow per model, each pairing the shared recipe with a
# model spec
q4_rf_wf <- workflow() |>
  add_recipe(rec_camels) |>
  add_model(q4_rf_mod)

q4_xgb_wf <- workflow() |>
  add_recipe(rec_camels) |>
  add_model(q4_xgb_mod)

q4_lm_wf <- workflow() |>
  add_recipe(rec_camels) |>
  add_model(q4_lm_mod)

# Fit each workflow to the 10-fold CV resamples
rf_results <- fit_resamples(q4_rf_wf, resamples = camels_folds)
xgb_results <- fit_resamples(q4_xgb_wf, resamples = camels_folds)
lm_results <- fit_resamples(q4_lm_wf, resamples = camels_folds)
Evaluation
# Use autoplot (and rank_results) to compare the three Question 4 models.
# FIX: resample on camels_folds — the CV folds built from THIS question's
# training split — not camels_cv, which belongs to the earlier Question 3
# split (different seed, and its training data was not filtered for the
# high_prec_dur predictor that rec_camels requires).
q4_wf <- workflow_set(list(rec_camels),
                      list(q4_rf_mod, q4_xgb_mod, q4_lm_mod)) %>%
  workflow_map('fit_resamples', resamples = camels_folds)

autoplot(q4_wf)
Out of the random forest, linear, and xgboost models, I think the random forest is the strongest performer because it shows the lowest RMSE and the highest R² of the three, capturing the relationship between predictors and response very well. These metrics indicate that the random forest's predictions are close to the actual values and that the model explains a large share of the variance in the data. In other words, it consistently makes accurate predictions and captures the underlying patterns in the dataset better than the other models.
Extract and Evaluate
# Build a workflow with the favorite model (random forest), the recipe,
# and fit it to all of the training data
final_wf <- workflow() %>%
  add_recipe(rec_camels) %>%
  add_model(q4_rf_mod) %>%
  fit(data = train_camels)

# Use augment to make predictions on the held-out test data
final_wf_data <- augment(final_wf, new_data = test_camels)

# Create a plot of the observed vs predicted values
# FIX: removed the stray "?" from the y-axis label
ggplot(final_wf_data, aes(x = .pred, y = logQmean, colour = logQmean)) +
  scale_color_gradient2(low = "blue3", mid = "yellow", high = "chartreuse") +
  geom_point() +
  # Dashed 1:1 reference line
  geom_abline(linetype = 2) +
  labs(title = "Random Forest Model: Observed vs. Predicted",
       x = "Predicted Log Mean Flow",
       y = "Observed Log Mean Flow")
These results suggest that the random forest model performs well on predicting logQmean based on the predictors chosen. Most of the points are along the 1:1 line, indicating that the model’s predicted values match the observed values closely, accurately capturing the relationship between the predictors and logQmean effectively.